Total Network Tools 2002

home *** CD-ROM | disk | FTP | other *** search

/ Total Network Tools 2002 / NextStepPublishing-TotalNetworkTools2002-Win95.iso / Archive / Offline Browsing / HTTrack.exe / data1.cab / Sources / src / htsparse.c < prev next >

Wrap

C/C++ Source or Header | 2001-04-28 | 90.3 KB | 1,933 lines

/* ------------------------------------------------------------ */ /* HTTrack Website Copier, Offline Browser for Windows and Unix Copyright (C) Xavier Roche and other contributors This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. Important notes: - We hereby ask people using this source NOT to use it in purpose of grabbing emails addresses, or collecting any other private information on persons. This would disgrace our work, and spoil the many hours we spent on it. Please visit our Website: http://www.httrack.com */ /* ------------------------------------------------------------ */ /* File: Main source */ /* DIRECT INCLUDE TO httrack.c */ /* Author: Xavier Roche */ /* ------------------------------------------------------------ */ #if HTS_ANALYSTE if (hts_htmlcheck(r.adr,(int)r.size,urladr,urlfil)) { #endif FILE* fp=NULL; // fichier Θcrit localement // et si level>0 char* adr=r.adr; // pointeur (on parcourt) char* lastsaved; // adresse du dernier octet sauvΘ + 1 if ( (opt.debug>1) && (opt.log!=NULL) ) { fspc(opt.log,"debug"); fprintf(opt.log,"scan file.."LF); test_flush; } // Indexing! #if HTS_MAKE_KEYWORD_INDEX if (opt.kindex) { if (index_keyword(r.adr,r.size,r.contenttype,savename,opt.path_html)) { if ( (opt.debug>1) && (opt.log!=NULL) ) { fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..done"LF); test_flush; } } else { if ( (opt.debug>1) && (opt.log!=NULL) ) { fspc(opt.log,"debug"); fprintf(opt.log,"indexing file..error!"LF); test_flush; } } } #endif // Now, parsing if ((opt.getmode & 1) && (ptr>0)) { // rΘcupΘrer les html sur disque // crΘer le fichier html local HT_ADD_FOP; // Θcrire peu α peu le fichier } if (!error) { int detect_title=0; // dΘtection du title // char* in_media=NULL; // in other media type (real media and so..) int intag=0; // on est dans un tag int incomment=0; // dans un  is used somewhere else.. darn those browsers are dirty */ if (!strstr(adr,"-->")) { intag=0; incomment=0; intag_start_valid=0; } } #endif } //} } //else if (*adr==34) { // inquote=(inquote?0:1); //} else if (intag || inscript) { // nous sommes dans un tag/commentaire, tester si on recoit un tag int p_type=0; int p_nocatch=0; int p_searchMETAURL=0; // chercher ..URL=<url> int add_class=0; // ajouter .class char* p_flush=NULL; // ------------------------------------------------------------ // parsing ΘvolΘ // ------------------------------------------------------------ if (((isalpha((unsigned char)*adr)) || (*adr=='/') || (inscript) || (inscriptgen))) { // sinon pas la peine de tester.. /* caractΦre de terminaison pour "miniparsing" javascript=.. ? (ex: <a href="javascript:()" action="foo"> ) */ if (inscript_tag) { if (inscript_tag_lastc) { if (*adr == inscript_tag_lastc) { /* sortir */ inscript_tag=inscript=0; incomment=0; } } } // Construction index.html (sommaire) // Avant de tester les a href, // Ici on teste si l'on doit construire l'index vers le(s) site(s) miroir(s) if (!in_media) { if (opt.makeindex && (ptr>0)) { if (opt.getmode & 1) { // autorisation d'Θcrire if (!makeindex_done) { // autoriation d'Θcrire un index if (opt.depth == liens[ptr]->depth) { // on note toujours les premiers liens if (!detect_title) { p=strfield(adr,"title"); if (p) { if (*(adr-1)=='/') p=0; // /title } else { if (strfield(adr,"/html")) p=-1; // noter, mais sans titre else if (strfield(adr,"body")) p=-1; // noter, mais sans titre } } else p=0; if (p) { // ok center if (makeindex_fp==NULL) { verif_backblue(opt.path_html); // gΘnΘrer gif makeindex_fp=filecreate(fconcat(opt.path_html,"index.html")); if (makeindex_fp!=NULL) { fprintf(makeindex_fp,"<HTML>"CRLF); fprintf(makeindex_fp,""CRLF); fprintf(makeindex_fp,"<HEAD>"CRLF"<TITLE>"); fprintf(makeindex_fp,"Local index"); fprintf(makeindex_fp,"</TITLE>"CRLF"</HEAD>"CRLF"<BODY BACKGROUND=\"backblue.gif\"><H1 ALIGN=Center>"); fprintf(makeindex_fp,"<U>Index of locally available sites:</U>"CRLF"</H1>"CRLF"<BR><BR>"CRLF); fprintf(makeindex_fp,"<TABLE BORDER=\"0\" WIDTH=\"100%%\" CELLSPACING=\"1\" CELLPADDING=\"0\">"CRLF); fprintf(makeindex_fp,"<UL>"CRLF); } else makeindex_done=-1; // fait, erreur } if (makeindex_fp!=NULL) { char tempo[HTS_URLMAXSIZE*2]; char s[HTS_URLMAXSIZE*2]; char* a=NULL; char* b=NULL; s[0]='\0'; if (p>0) { a=strchr(adr,'>'); if (a!=NULL) { a++; while(is_space(*a)) a++; // sauter espaces & co b=strchr(a,'<'); // prochain tag } } if (lienrelatif(tempo,liens[ptr]->sav,concat(opt.path_html,"index.html"))==0) { detect_title=1; // ok dΘtectΘ pour cette page! makeindex_links++; // un de plus strcpy(makeindex_firstlink,tempo); // fprintf(makeindex_fp,"<TR>"CRLF"<TD BACKGROUND=\"fade.gif\">"CRLF); //fprintf(makeindex_fp,"<UL>"CRLF); fprintf(makeindex_fp,"<LI>"CRLF); if ((b==a) || (a==NULL) || (b==NULL)) { // pas de titre fprintf(makeindex_fp,"<A HREF=\"%s\">%s</A><BR>"CRLF,tempo,tempo); } else if ((b-a)<256) { b--; while(is_space(*b)) b--; strncpy(s,a,b-a+1); *(s+(b-a)+1)='\0'; fprintf(makeindex_fp,"<A HREF=\"%s\">%s</A><BR>"CRLF,tempo,s); } fprintf(makeindex_fp,"</LI>"CRLF); //fprintf(makeindex_fp,"</UL>"CRLF); fprintf(makeindex_fp,"</TD>"CRLF"</TR>"CRLF); } } } } else if (liens[ptr]->depth<opt.depth) { // on a sautΘ level1+1 et level1 if (makeindex_fp) { fprintf(makeindex_fp,"</UL>"CRLF); fprintf(makeindex_fp,"</TABLE>"CRLF); fprintf(makeindex_fp,"<BR>"CRLF"<BR>"CRLF"<BR>"CRLF"</BODY>"CRLF); fprintf(makeindex_fp,"<I><H6 ALIGN=\"RIGHT\">Mirror and index made by HTTrack Website Copier/"HTTRACK_VERSION" "HTTRACK_AFF_AUTHORS"</H6></I>"CRLF); fprintf(makeindex_fp,""CRLF); fprintf(makeindex_fp,""CRLF); if (makeindex_links == 1) { fprintf(makeindex_fp,"<meta HTTP-EQUIV=\"Refresh\" CONTENT=\"0; URL=%s\">"CRLF,makeindex_firstlink); } fprintf(makeindex_fp,"</HTML>"CRLF); fflush(makeindex_fp); fclose(makeindex_fp); // α ne pas oublier sinon on passe une nuit blanche makeindex_fp=NULL; usercommand(0,NULL,fconcat(opt.path_html,"index.html")); } makeindex_done=1; // ok c'est fait } } } } // if (opt.makeindex) } // FIN Construction index.html (sommaire) // Note: // Certaines pages ne respectent pas le html // notamment les guillements ne sont pas fixΘs // Nous sommes dans un tag, donc on peut faire un test plus // large pour pouvoi prendre en compte ces particularitΘs // α vΘrifier: ACTION, CODEBASE, VRML if (in_media) { if (strcmp(in_media,"RAM")==0) { // real media p=0; valid_p=1; } } else if (ptr>0) { /* pas premiΦre page 0 (primary) */ p=0; // saut pour le nom de fichier: adresse nom fichier=adr+p // ------------------------------ // dΘtection d'Θcriture JavaScript. // osons les obj.write et les obj.href=.. ! osons! // note: inscript==1 donc on sautera aprΦs les \" if (inscript) { if (inscriptgen) { // on est dΘja dans un objet gΘnΘrant.. if (*adr==scriptgen_q) { // fermeture des " ou ' if (*(adr-1)!='\\') { // non inscriptgen=0; // ok parsing terminΘ } } } else { char* a=NULL; char check_this_fking_line=0; // parsing code javascript.. char must_be_terminated=0; // caractΦre obligatoire de terminaison! int token_size; if (!(token_size=strfield(adr,".writeln"))) // dΘtection ...objet.write[ln]("code html")... token_size=strfield(adr,".write"); if (token_size) { a=adr+token_size; while(is_realspace(*a)) a++; // sauter espaces if (*a=='(') { // dΘbut parenthΦse check_this_fking_line=2; // α parser! must_be_terminated=')'; a++; // sauter ( } } // euhh ??? ??? /* else if (strfield(adr,".href")) { // dΘtection ...objet.href="... a=adr+5; while(is_realspace(*a)) a++; // sauter espaces if (*a=='=') { // ohh un Θgal check_this_fking_line=1; // α noter! must_be_terminated=';'; // et si t'as oubliΘ le ; tu sais pas coder a++; // sauter = } }*/ // on a un truc du genre instruction"code gΘnΘrΘ" dont on parse le code if (check_this_fking_line) { while(is_realspace(*a)) a++; if ((*a=='\'') || (*a=='"')) { // dΘpart de '' ou "" char *b; int ex=0; scriptgen_q=*a; // quote b=a+1; // dΘpart de la chaεne // vΘrifier forme ("code") et pas ("code"+var), ingΘrable do { a++; // caractΦre suivant if (*a==scriptgen_q) if (*(a-1)!='\\') // quote non slash ex=1; // sortie if ((*a==10) || (*a==13)) ex=1; } while(!ex); if (*a==scriptgen_q) { // fin du quote a++; while(is_realspace(*a)) a++; if (*a==must_be_terminated) { // parenthΦse fermante: ("..") // bon, on doit parser une ligne javascript // 1) si check.. ==1 alors c'est un nom de fichier direct, donc // on fixe p sur le saut nΘcessaire pour atteindre le nom du fichier // et le moteur se dΘbrouillera ensuite tout seul comme un grand // 2) si check==2 c'est un peu plus tordu car lα on gΘnΘre du // code html au sein de code javascript au sein de code html // dans ce cas on doit fixer un flag α un puis ensuite dans la boucle // on devra parser les instructions standard comme <a href etc // NOTE: le code javascript autogΘnΘrΘ n'est pas pris en compte!! // (et ne marche pas dans 50% des cas de toute facon!) if (check_this_fking_line==1) { p=(int) b-(int) adr; // calculer saut! } else { inscriptgen=1; // SCRIPTGEN actif adr=b; // jump } if ((opt.debug>1) && (opt.log!=NULL)) { char str[512]; str[0]='\0'; strncat(str,b,minimum((int) a-(int) b+1,32)); fspc(opt.log,"debug"); fprintf(opt.log,"active code (%s) detected in javascript: %s"LF,(check_this_fking_line==2)?"parse":"pickup",str); test_flush; } } } } } } } // fin detection code gΘnΘrant javascript vers html // ------------------------------ // analyse proprement dite, A HREF=.. etc.. if (!p) { // si dans un tag, et pas dans un script - sauf si on analyse un obj.write(".. if ((intag && (!inscript)) || inscriptgen) { if ( (*(adr-1)=='<') || (is_space(*(adr-1))) ) { // <tag < tag etc // <A HREF=.. pour les liens HTML p=rech_tageq(adr,"href"); if (p) { // href.. tester si c'est une bas href! if ((intag_start_valid) && check_tag(intag_start,"base")) { // oui! // ** note: base href et codebase ne font pas bon mΘnage.. p_type=2; // c'est un chemin } } /* Tags supplΘmentaires α vΘrifier (<img src=..> etc) */ if (p==0) { int i=0; while( (p==0) && (strnotempty(hts_detect[i])) ) { p=rech_tageq(adr,hts_detect[i]); i++; } } /* Tags supplΘmentaires α vΘrifier : URL=.. */ if (p==0) { int i=0; while( (p==0) && (strnotempty(hts_detectURL[i])) ) { p=rech_tageq(adr,hts_detectURL[i]); i++; } if (p) p_searchMETAURL=1; } /* Tags supplΘmentaires α vΘrifier, mais α ne pas capturer */ if (p==0) { int i=0; while( (p==0) && (strnotempty(hts_detectandleave[i])) ) { p=rech_tageq(adr,hts_detectandleave[i]); i++; } if (p) p_nocatch=1; /* ne pas rechercher */ } /* EvΘnements */ if (p==0) { int i=0; /* dΘtection onLoad etc */ while( (p==0) && (strnotempty(hts_detect_js[i])) ) { p=rech_tageq(adr,hts_detect_js[i]); i++; } /* non dΘtectΘ - dΘtecter Θgalement les onXxxxx= */ if (p==0) { if ( (*adr=='o') && (*(adr+1)=='n') && isUpperLetter(*(adr+2)) ) { p=0; while(isalpha((unsigned char)adr[p]) && (p<64) ) p++; if (p<64) { while(is_space(adr[p])) p++; if (adr[p]=='=') p++; else p=0; } else p=0; } } /* OK, ΘvΘnement repΘrΘ */ if (p) { inscript_tag_lastc=*(adr+p-1); /* α attendre α la fin */ adr+=p; /* saut */ /* On est dΘsormais dans du code javascript */ inscript_tag=inscript=1; } p=0; /* quoi qu'il arrive, ne rien dΘmarrer ici */ } // <APPLET CODE=.. pour les applet java.. [CODEBASE (chemin..) α faire] if (p==0) { p=rech_tageq(adr,"code"); if (p) { if ((intag_start_valid) && check_tag(intag_start,"applet")) { // dans un <applet ! p_type=-1; // juste le nom de fichier+dossier, Θcire avant codebase add_class=1; // ajouter .class au besoin // vΘrifier qu'il n'y a pas de codebase APRES // sinon on swappe les deux. // pas trΦs propre mais c'est ce qu'il y a de plus simple α faire!! { char *a; a=adr; while((*a) && (*a!='>') && (!rech_tageq(a,"codebase"))) a++; if (rech_tageq(a,"codebase")) { // banzai! codebase= char* b; b=strchr(a,'>'); if (b) { if (((int) b - (int) adr) < 1000) { // au total < 1Ko char tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; strncat(tempo,a,(int) b - (int) a); strcat( tempo," "); strncat(tempo,adr,(int) a - (int) adr - 1); // Θventuellement remplire par des espaces pour avoir juste la taille while((int) strlen(tempo)<((int) b - (int) adr)) strcat(tempo," "); // pas d'erreur? if ((int) strlen(tempo) == ((int) b - (int) adr)) { strncpy(adr,tempo,strlen(tempo)); // PAS d'octet nul α la fin! p=0; // DEVALIDER!! p_type=0; add_class=0; } } } } } } } } // liens α patcher mais pas α charger (ex: codebase) if (p==0) { // note: si non chargΘ (ex: ignorer .class) patchΘ tout de mΩme p=rech_tageq(adr,"codebase"); if (p) { if ((intag_start_valid) && check_tag(intag_start,"applet")) { // dans un <applet ! p_type=-2; } else p=-1; // ne plus chercher } } // Meta tags pour robots if (p==0) { if (opt.robots) { if ((intag_start_valid) && check_tag(intag_start,"meta")) { if (rech_tageq(adr,"name")) { // name=robots.txt char tempo[1100]; char* a; tempo[0]='\0'; a=strchr(adr,'>'); #if DEBUG_ROBOTS printf("robots.txt meta tag detected\n"); #endif if (a) { if (((int) a - (int) adr) < 999 ) { strncat(tempo,adr,(int) a - (int) adr); if (strstrcase(tempo,"content")) { if (strstrcase(tempo,"robots")) { if (strstrcase(tempo,"nofollow")) { #if DEBUG_ROBOTS printf("robots.txt meta tag: nofollow in %s%s\n",urladr,urlfil); #endif nofollow=1; // NE PLUS suivre liens dans cette page if (opt.errlog) { fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s not scanned (follow robots meta tag)"LF,urladr,urlfil); test_flush; } } } } } } } } } } // entrΘe dans une applet javascript /*if (!inscript) { // sinon on est dans un obj.write(".. if (p==0) if (rech_sampletag(adr,"script")) if (check_tag(intag_start,"script")) { inscript=1; } }*/ // Ici on procΦde α une analyse du code javascript pour tenter de rΘcupΘrer // certains fichiers Θvidents. // C'est devenu obligatoire vu le nombre de pages qui intΦgrent // des images rΘactives par exemple } } else if (inscript) { if (strfield(adr,"/script") ) { char* a=adr; //while(is_realspace(*(--a))); while( is_realspace(*a) ) a--; a--; if (*a=='<') { // s√r que c'est un tag? inscript=0; } } else { int nc; char expected = '='; // caractΦre attendu aprΦs char* expected_end = ";"; if (inscript_tag) expected_end=";\"\'"; // voir a href="javascript:doc.location='foo'" nc = strfield(adr,".src"); // nom.src="image"; if (!nc) nc = strfield(adr,".location"); // document.location="doc" if (!nc) nc = strfield(adr,".href"); // document.location="doc" if (!nc) if ( (nc = strfield(adr,".open")) ) { // window.open("doc",.. expected='('; // parenthΦse expected_end="),"; // fin: virgule ou parenthΦse } if (!nc) if ( (nc = strfield(adr,".replace")) ) { // window.replace("url") expected='('; // parenthΦse expected_end=")"; // fin: parenthΦse } if (!nc) if ( (nc = strfield(adr,".link")) ) { // window.link("url") expected='('; // parenthΦse expected_end=")"; // fin: parenthΦse } if (nc) { char *a; a=adr+nc; while(is_space(*a)) a++; if (*a == expected) { a++; while(is_realspace(*a)) a++; if ((*a==34) || (*a=='\'')) { char *b,*c; a++; b=a; while((*b!=34) && (*b!='\'') && (*b!='\0')) b++; c=b--; c++; while(*c==' ') c++; if ((strchr(expected_end,*c)) || (*c=='\n') || (*c=='\r')) { c-=2; if ((int) c-(int) a+1) { if ((opt.debug>1) && (opt.log!=NULL)) { char str[512]; str[0]='\0'; strncat(str,a,minimum((int) c-(int) a+1,32)); fspc(opt.log,"debug"); fprintf(opt.log,"link detected in javascript: %s"LF,str); test_flush; } p=(int) a- (int) adr; // p non nul: TRAITER CHAINE COMME FICHIER } } } } } } } } } else p=rech_tageq(adr,"primary"); // lien primaire, yeah } // ------------------------------------------------------------ // dernier recours - parsing "sale" : dΘtection systΘmatique des .gif, etc. // risque: gΘnΘrer de faux fichiers parazites // fix: ne parse plus dans les commentaires // ------------------------------------------------------------ if ( (opt.parseall) && (ptr>0) && (!in_media) ) { // option parsing "brut" int incomment_justquit=0; if (!is_realspace(*adr)) { int noparse=0; // Gestion des /* */ if (inscript) { if (parseall_incomment) { if ((*adr=='/') && (*(adr-1)=='*')) parseall_incomment=0; incomment_justquit=1; // ne pas noter dernier caractΦre } else { if ((*adr=='/') && (*(adr+1)=='*')) parseall_incomment=1; } } else parseall_incomment=0; /* vΘrifier que l'on est pas dans un  pur */ if ( (!intag) && (incomment) && (!inscript)) noparse=1; /* commentaire */ // recherche d'URLs if ((!parseall_incomment) && (!noparse)) { if (!p) { // non dΘja trouvΘ if (adr != r.adr) { // >1 caractΦre // scanner les chaines if ((*adr == '\"') || (*adr=='\'')) { // "xx.gif" 'xx.gif' if (strchr("=(,",parseall_lastc)) { // exemple: a="img.gif.. char *a=adr; char stop=*adr; // " ou ' int count=0; // sauter caractΦres a++; while((*a) && (*a!='\'') && (*a!='\"') && (count<HTS_URLMAXSIZE)) { count++; a++; } // ok chaine terminΘe par " ou ' if ((*a == stop) && (count<HTS_URLMAXSIZE) && (count>0)) { char c; char* aend; // aend=a; // sauver dΘbut a++; while(is_realspace(*a)) a++; c=*a; if (strchr("),;>/+",c)) { // exemple: ..img.gif"; // le / est pour funct("img.gif" /* URL */); char tempo[HTS_URLMAXSIZE*2]; char type[256]; int url_ok=0; // url valide? tempo[0]='\0'; type[0]='\0'; // strncat(tempo,adr+1,count); // if ((!strchr(tempo,' ')) || inscript) { // espace dedans: mΘfiance! (sauf dans code javascript) int invalid_url=0; // vΘrifier qu'il n'y a pas de caractΦres spΘciaux if (strchr(tempo,'*') || strchr(tempo,'<') || strchr(tempo,'>')) invalid_url=1; /* non invalide? */ if (!invalid_url) { // Un plus α la fin? Alors ne pas prendre sauf si extension ("/toto.html#"+tag) if (c!='+') { // PAS de plus α la fin char* a; if ((strncmp(tempo,"http://",7)==0) || (strncmp(tempo,"ftp://",6)==0)) // ok pas de problΦme url_ok=1; else if (tempo[strlen(tempo)-1]=='/') { // un slash: ok.. if (inscript) // sinon si pas javascript, mΘfiance (rΘpertoire style base?) url_ok=1; } else if ((a=strchr(tempo,'/'))) { // un slash: ok.. if (inscript) { // sinon si pas javascript, mΘfiance (style "text/css") if (strchr(a+1,'/')) // un seul / : abandon (STYLE type='text/css') url_ok=1; } } } // Prendre si extension reconnue if (!url_ok) { get_httptype(type,tempo,0); if (strnotempty(type)) // type reconnu! url_ok=1; } // // Ok, cela pourrait Ωtre une URL if (url_ok) { // Accepter URL, on la traitera comme une URL normale!! p=1; } } } } } } } } } // p == 0 // plus dans un commentaire if (!incomment_justquit) parseall_lastc=*adr; // caractΦre avant le prochain } // not in comment } // if realspace } // if parseall // ------------------------------------------------------------ // p!=0 : on a repΘrΘ un Θventuel lien // ------------------------------------------------------------ // if ((p>0) || (valid_p)) { // on a repΘrΘ un lien //int lien_valide=0; char* eadr=NULL; /* fin de l'URL */ char* quote_adr=NULL; /* adresse du ? dans l'adresse */ int ok=1; char quote='\0'; // TEST /*{ static int loop=0; if ((++loop)%5000==0) loop=0; }*/ // si nofollow a ΘtΘ dΘclenchΘ, rΘΘcrire tous les liens en externe if (nofollow) p_nocatch=1; // Θcrire codebase avant, flusher avant code if ((p_type==-1) || (p_type==-2)) { if ((opt.getmode & 1) && (ptr>0)) { HT_ADD_ADR; // refresh } lastsaved=adr; // dernier Θcrit+1 } // sauter espaces adr+=p; while((is_space(*adr)) && (quote=='\0')) { if (!quote) if ((*adr=='\"') || (*adr=='\'')) quote=*adr; // on doit attendre cela α la fin // puis quitter adr++; // sauter les espaces, "" et cie } /* s'arrΩter que ce soit un ' ou un " : pour document.write('<img src="foo'+a); par exemple! */ if (inscript) quote='\0'; // sauter Θventuel \" ou \' javascript if (inscript) { // on est dans un obj.write(".. if (*adr=='\\') { if ((*(adr+1)=='\'') || (*(adr+1)=='"')) { // \" ou \' adr+=2; // sauter } } } // sauter content="1;URL=http://.. if (p_searchMETAURL) { int l=0; while(!strfield(adr+l,"URL=") && (l<128) ) l++; if (!strfield(adr,"URL=")) ok=-1; else adr+=(l+4); } /* Θviter les javascript:document.location=.. : les parser, plut⌠t */ if (ok!=-1) { if (strfield(adr,"javascript:")) { ok=-1; /* On est dΘsormais dans du code javascript */ inscript_tag=inscript=1; inscript_tag_lastc=quote; /* α attendre α la fin */ } } if (p_type==1) { if (*adr=='#') { adr++; // sauter # pour usemap etc } } eadr=adr; // ne pas flusher aprΦs code si on doit Θcrire le codebase avant! if ((p_type!=-1) && (p_type!=2) && (p_type!=-2)) { if ((opt.getmode & 1) && (ptr>0)) { HT_ADD_ADR; // refresh } lastsaved=adr; // dernier Θcrit+1 // aprΦs on Θcrira soit les donnΘes initiales, // soir une URL/lien modifiΘ! } else if (p_type==-1) p_flush=adr; // flusher jusqu'α adr ensuite if (ok!=-1) { // continuer // dΘcouper le lien do { if ((* (unsigned char*) eadr)<32) { // caractΦre de contr⌠le (ou \0) if (!is_space(*eadr)) ok=0; } if ( ( ((int) eadr) - ((int) adr) ) > HTS_URLMAXSIZE) // ** trop long, >HTS_URLMAXSIZE caractΦres (on prΘvoit HTS_URLMAXSIZE autres pour path) ok=-1; // ne pas traiter ce lien if (ok) { //if (*eadr!=' ') { if (is_space(*eadr)) { // guillemets,CR, etc if ((!quote) || (*eadr==quote)) // si pas d'attente de quote spΘciale ou si quote atteinte ok=0; } else { switch(*eadr) { case '>': if (!quote) { if (!inscript) { intag=0; // PLUS dans un tag! intag_start_valid=0; } ok=0; } break; /*case '<':*/ case '#': ok=0; break; // case '?': non! case '\\': if (inscript) ok=0; break; // \" ou \' point d'arrΩt case '?': quote_adr=adr; break; // noter position query } } //} } eadr++; } while(ok==1); if ( (((int) eadr)-((int) adr)) <= 1) ok=-1; // lien vide } if (ok==0) { // tester un lien char lien[HTS_URLMAXSIZE*2]; int meme_adresse=0; // 0 par dΘfaut pour primary //char *copie_de_adr=adr; //char* p; // construire lien (dΘcoupage) if ( (((int) eadr)-((int) adr)-1) < HTS_URLMAXSIZE ) { // pas trop long? strncpy(lien,adr,((int) eadr)-((int) adr)-1); *(lien+ (((int) eadr)-((int) adr))-1 )='\0'; //printf("link: %s\n",lien); // supprimer les espaces while((lien[strlen(lien)-1]==' ') && (strnotempty(lien))) lien[strlen(lien)-1]='\0'; // supprimer les // en / (sauf pour http://) { char *a,*p,*q; int done=0; a=strstr(lien,":/"); // http:// if (a) { a++; while(*a=='/') a++; // position aprΦs http:// } else { a=lien; // dΘbut } q=strchr(a,'?'); // ne pas traiter aprΦs '?' if (!q) q=a+strlen(a)-1; while(( p=strstr(a,"//")) && (!done) ) { // remplacer // par / if ((int) p>(int) q) { // aprΦs le ? (toto.cgi?param=1//2.3) done=1; // stopper } else { char tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; strncat(tempo,a,(int) p - (int) a); strcat (tempo,p+1); strcpy(a,tempo); // recopier } } } } else lien[0]='\0'; // erreur // ------------------------------------------------------ // Lien repΘrΘ et extrait if (strnotempty(lien)>0) { // construction du lien char adr[HTS_URLMAXSIZE*2],fil[HTS_URLMAXSIZE*2]; // ATTENTION adr cache le "vrai" adr int forbidden_url=-1; // lien non interdit (mais non autorisΘ..) int just_test_it=0; // mode de test des liens int set_prio_to=0; // pour capture de page isolΘe int import_done=0; // lien importΘ (ne pas scanner ensuite *α priori*) // adr[0]='\0'; fil[0]='\0'; // // 0: autorisΘ // 1: interdit (patcher tout de mΩme adresse) if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"link detected in html: %s"LF,lien); test_flush; } // purger CR,LF rΘsiduels (IMG SRC="foo.<\n>gif") { char* a; while ((a=strchr(lien,'\n'))) { char tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; strncat(tempo,lien,(int) a - (int) lien); strcat(tempo,a+1); strcpy(lien,tempo); } while ((a=strchr(lien,'\r'))) { char tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; strncat(tempo,lien,(int) a - (int) lien); strcat(tempo,a+1); strcpy(lien,tempo); } } /* Unescape/escape %20 and other */ { char query[HTS_URLMAXSIZE*2]; char* a=strchr(lien,'?'); if (a) { strcpy(query,a); *a='\0'; } else query[0]='\0'; // conversion & -> & et autres joyeusetΘs unescape_amp(lien); // dΘcoder l'inutile (%2E par exemple) et coder espaces strcpy(lien,unescape_http(lien)); escape_spc_url(lien); strcat(lien,query); /* restore */ } // convertir les Θventuels \ en des / pour Θviter des problΦmes de reconnaissance! { char* a=jump_identification(lien); while( (a=strchr(a,'\\')) ) *a='/'; } // supprimer le(s) ./ while ((lien[0]=='.') && (lien[1]=='/')) { char tempo[HTS_URLMAXSIZE*2]; strcpy(tempo,lien+2); strcpy(lien,tempo); } if (strnotempty(lien)==0) // sauf si plus de nom de fichier strcpy(lien,"./"); // vΘrifie les /~machin -> /~machin/ // supposition dangereuse? if (lien[strlen(lien)-1]!='/') { char *a=lien+strlen(lien)-1; // Θviter aussi index~1.html while (((int) a>(int) lien) && (*a!='~') && (*a!='/') && (*a!='.')) a--; if (*a=='~') { strcat(lien,"/"); // ajouter slash } } // Θliminer les Θventuels :80 (port par dΘfaut!) { char* a; a=strstr(lien,"://"); if (a) a+=3; else a=lien; while((*a) && (*a!='/') && (*a!=':')) a++; if (*a==':') { // port int port=0; char* b=a+1; while(isdigit((unsigned char)*b)) { port*=10; port+=(int) (*b-'0'); b++; } if (port==80) { // port 80, default char tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; strncat(tempo,lien,(int) a-(int) lien); strcat(tempo,a+3); // sauter :80 strcpy(lien,tempo); } } } // filtrer les parazites (mailto & cie) if (strfield(lien,"mailto:")) { // ne pas traiter error=1; } else if (strfield(lien,"news:")) { // ne pas traiter error=1; } // vΘrifier que l'on ne doit pas ajouter de .class if (!error) { if (add_class) { char *a = lien+strlen(lien)-1; while(((int) a > (int) lien) && (*a!='/') && (*a!='.')) a--; if (*a != '.') strcat(lien,".class"); // ajouter .class } } // si c'est un chemin, alors vΘrifier (toto/toto.html -> http://www/toto/) if (!error) { if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"position link check %s"LF,lien); test_flush; } if ((p_type==2) || (p_type==-2)) { // code ou codebase // VΘrifier les codebase=applet (au lieu de applet/) if (p_type==-2) { // codebase if (strnotempty(lien)) { if (fil[strlen(lien)-1]!='/') { // pas rΘpertoire strcat(lien,"/"); } } } /* only one ending / (bug on some pages) */ if ((int)strlen(lien)>2) { while( (lien[strlen(lien)-2]=='/') && ((int)strlen(lien)>2) ) /* double // (bug) */ lien[strlen(lien)-1]='\0'; } // copier nom host si besoin est if (strstr(lien,"://")==NULL) { // pas de http:// char adr2[HTS_URLMAXSIZE*2],fil2[HTS_URLMAXSIZE*2]; // ** euh ident_url_relatif?? if (ident_url_relatif(lien,urladr,urlfil,adr2,fil2)<0) { error=1; } else { strcpy(lien,"http://"); strcat(lien,adr2); if (*fil2!='/') strcat(lien,"/"); strcat(lien,fil2); { char* a; a=lien+strlen(lien)-1; while((*a) && (*a!='/') && ((int) a> (int) lien)) a--; if (*a=='/') { *(a+1)='\0'; } } //char tempo[HTS_URLMAXSIZE*2]; //strcpy(tempo,"http://"); //strcat(tempo,urladr); // host //if (*lien!='/') // strcat(tempo,"/"); //strcat(tempo,lien); //strcpy(lien,tempo); } } if (!error) { // pas d'erreur? if (p_type==2) { // code ET PAS codebase char* a=lien+strlen(lien)-1; while( ((int) a > (int) lien) && (*a) && (*a!='/')) a--; if (*a=='/') // ok on a repΘrΘ le dernier / *(a+1)='\0'; // couper else { *lien='\0'; // Θliminer error=1; // erreur, ne pas poursuivre } } // stocker base ou codebase? switch(p_type) { case 2: { //if (*lien!='/') strcat(base,"/"); strcpy(base,lien); } break; // base case -2: { //if (*lien!='/') strcat(codebase,"/"); strcpy(codebase,lien); } break; // base } if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"code/codebase link %s base %s"LF,lien,base); test_flush; } //printf("base code: %s - %s\n",lien,base); } } else { char* _base; if (p_type==-1) // code (applet) _base=codebase; else _base=base; // ajouter chemin de base href.. if (strnotempty(_base)) { // considΘrer base if (!strstr(lien,"://")) { // non absolue if (*lien!='/') { // non absolu sur le site (/) if ( ((int) strlen(_base)+(int) strlen(lien))<HTS_URLMAXSIZE) { char tempo[HTS_URLMAXSIZE*2]; // base est absolue strcpy(tempo,_base); strcat(tempo,lien); strcpy(lien,tempo); // patcher en considΘrant base // ** vΘrifier que ../ fonctionne (ne doit pas arriver mais bon..) if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"link modified with code/codebase %s"LF,lien); test_flush; } } else { error=1; // erreur if (opt.errlog) { fspc(opt.errlog,"error"); fprintf(opt.errlog,"Link %s too long with base href"LF,lien); test_flush; } } } } } } } // transformer lien quelconque (http, relatif, etc) en une adresse // et un chemin+fichier (adr,fil) if (!error) { int reponse; if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"build relative link %s with %s%s"LF,lien,urladr,urlfil); test_flush; } if ((reponse=ident_url_relatif(lien,urladr,urlfil,adr,fil))<0) { adr[0]='\0'; // erreur if (reponse==-2) { if (opt.errlog) { fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s not caught (unknown ftp:// protocol)"LF,lien); test_flush; } } } } else { if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"link %s not build, error detected before"LF,lien); test_flush; } adr[0]='\0'; } #if HTS_CHECK_STRANGEDIR // !ATTENTION! // Ici on teste les exotiques du genre www.truc.fr/machin (sans slash α la fin) // je n'ai pas encore trouvΘ le moyen de faire la diffΘrence entre un rΘpertoire // et un fichier en http A PRIORI : je fais donc un test // En cas de moved xxx, on recalcule adr et fil, tout simplement // DEFAUT: test effectuΘ plusieurs fois! α revoir!!! if ((adr[0]!='\0') && (strcmp(adr,"file://") && (p_type!=2) && (p_type!=-2)) { //## if ((adr[0]!='\0') && (adr[0]!=lOCAL_CHAR) && (p_type!=2) && (p_type!=-2)) { if (fil[strlen(fil)-1]!='/') { // pas rΘpertoire if (ishtml(fil)==-2) { // pas d'extension char loc[HTS_URLMAXSIZE*2]; // Θventuelle nouvelle position loc[0]='\0'; if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"link-check-directory: %s%s"LF,adr,fil); test_flush; } // tester Θventuelle nouvelle position switch (http_location(adr,fil,loc).statuscode) { case 200: // ok au final if (strnotempty(loc)) { // a changΘ d'adresse if (opt.errlog) { fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link %s%s has moved to %s for %s%s"LF,adr,fil,loc,urladr,urlfil); test_flush; } // recalculer adr et fil! if (ident_url(loc,adr,fil)==-1) { adr[0]='\0'; // cancel if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"link-check-dir: %s%s"LF,adr,fil); test_flush; } } } break; case -2: case -3: // timeout ou erreur grave if (opt.errlog) { fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Connection too slow for testing link %s%s (from %s%s)"LF,adr,fil,urladr,urlfil); test_flush; } break; } } } } #endif // Le lien doit juste Ωtre rΘΘcrit, mais ne doit pas gΘnΘrer un lien // exemple: <FORM ACTION="url_cgi"> if (p_nocatch) { forbidden_url=1; // interdire rΘcupΘration du lien if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"link ignored at %s%s"LF,adr,fil); test_flush; } } // Tester si un lien doit Ωtre acceptΘ ou refusΘ (wizard) // forbidden_url=1 : lien refusΘ // forbidden_url=0 : lien acceptΘ //if ((ptr>0) && (p_type!=2) && (p_type!=-2)) { // tester autorisations? if ((p_type!=2) && (p_type!=-2)) { // tester autorisations? if (!p_nocatch) { if (adr[0]!='\0') { if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"wizard link test at %s%s.."LF,adr,fil); test_flush; } forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens, adr,fil, filters,&filptr,filter_max, &robots, &set_prio_to, &just_test_it); if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard link test: %d"LF,forbidden_url); test_flush; } } } } // calculer meme_adresse meme_adresse=strfield2(adr,urladr); // DΘbut partie sauvegarde // ici on forme le nom du fichier α sauver, et on patche l'URL if (adr[0]!='\0') { // savename: simplifier les ../ et autres joyeusetΘs char save[HTS_URLMAXSIZE*2]; int r_sv=0; // En cas de moved, adresse premiΦre char former_adr[HTS_URLMAXSIZE*2]; char former_fil[HTS_URLMAXSIZE*2]; // save[0]='\0'; former_adr[0]='\0'; former_fil[0]='\0'; // // nom du chemin α sauver si on doit le calculer // note: url_savename peut dΘcider de tester le lien si il le trouve // suspect, et modifier alors adr et fil // dans ce cas on aura une rΘfΘrence directe au lieu des traditionnels // moved en cascade (impossible α reproduire α priori en local, lorsque des fichiers // gif sont impliquΘs par exemple) if ((p_type!=2) && (p_type!=-2)) { // pas base href ou codebase if (forbidden_url!=1) { char last_adr[HTS_URLMAXSIZE*2]; last_adr[0]='\0'; //char last_fil[HTS_URLMAXSIZE*2]=""; strcpy(last_adr,adr); // ancienne adresse //strcpy(last_fil,fil); // ancien chemin r_sv=url_savename(adr,fil,save,former_adr,former_fil,liens[ptr]->adr,liens[ptr]->fil,&opt,liens,lien_tot,back,back_max,&cache,&hash,ptr,numero_passe); if (strcmp(last_adr,adr) != 0) { // a changΘ // 2e test si moved // Tester si un lien doit Ωtre acceptΘ ou refusΘ (wizard) // forbidden_url=1 : lien refusΘ // forbidden_url=0 : lien acceptΘ if ((ptr>0) && (p_type!=2) && (p_type!=-2)) { // tester autorisations? if (!p_nocatch) { if (adr[0]!='\0') { if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"wizard moved link retest at %s%s.."LF,adr,fil); test_flush; } forbidden_url=hts_acceptlink(&opt,ptr,lien_tot,liens, adr,fil, filters,&filptr,filter_max, &robots, &set_prio_to, &just_test_it); if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"result for wizard moved link retest: %d"LF,forbidden_url); test_flush; } } } } //import_done=1; // c'est un import! meme_adresse=0; // on a changΘ } } else { strcpy(save,""); // dummy } } if (r_sv!=-1) { // pas d'erreur, on continue /* log */ if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); if (forbidden_url!=1) { // le lien va Ωtre chargΘ if ((p_type==2) || (p_type==-2)) { // base href ou codebase, pas un lien fprintf(opt.log,"Code/Codebase: %s%s"LF,adr,fil); } else if ((opt.getmode & 4)==0) { fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save); } else { if (!ishtml(fil)) fprintf(opt.log,"Record after: %s%s -> %s"LF,adr,fil,save); else fprintf(opt.log,"Record: %s%s -> %s"LF,adr,fil,save); } } else fprintf(opt.log,"External: %s%s"LF,adr,fil); test_flush; } /* FIN log */ // Θcrire lien if ((p_type==2) || (p_type==-2)) { // base href ou codebase, sauter lastsaved=eadr-1+1; // sauter " } else if (forbidden_url==1) { // le lien ne sera pas chargΘ, rΘfΘrence externe! if ((opt.getmode & 1) && (ptr>0)) { if (p_type!=-1) { // pas que le nom de fichier (pas classe java) if (!opt.external) { if (!strstr(adr,"://")) { HT_ADD("http://"); } HT_ADD(adr); if (*fil!='/') HT_ADD("/"); HT_ADD(fil); // } else { // fichier/page externe, mais on veut gΘnΘrer une erreur // int patch_it=0; int add_url=0; char* cat_name=NULL; char* cat_data=NULL; int cat_data_len=0; // ajouter lien external switch ((fil[strlen(fil)-1]=='/')?1:(ishtml(fil))) { case 1: case -2: // html ou rΘpertoire if (opt.getmode & 1) { // sauver html patch_it=1; // redirect add_url=1; // avec link? cat_name="external.html"; cat_data=HTS_DATA_UNKNOWN_HTML; cat_data_len=HTS_DATA_UNKNOWN_HTML_LEN; } break; default: // inconnu if ( (strfield2(fil+strlen(fil)-4,".gif")) || (strfield2(fil+strlen(fil)-4,".jpg")) || (strfield2(fil+strlen(fil)-4,".xbm")) || (ishtml(fil)!=0) ) { patch_it=1; // redirect add_url=1; // avec link aussi cat_name="external.gif"; cat_data=HTS_DATA_UNKNOWN_GIF; cat_data_len=HTS_DATA_UNKNOWN_GIF_LEN; } break; }// html,gif if (patch_it) { char save[HTS_URLMAXSIZE*2]; char tempo[HTS_URLMAXSIZE*2]; strcpy(save,opt.path_html); strcat(save,cat_name); if (lienrelatif(tempo,save,savename)==0) { HT_ADD(tempo); // page externe if (add_url) { HT_ADD("?link="); // page externe HT_ADD(adr); if (*fil!='/') HT_ADD("/"); HT_ADD(fil); } } // Θcrire fichier? if (!fexist(fconcat(opt.path_html,cat_name))) { FILE* fp = filecreate(fconcat(opt.path_html,cat_name)); if (fp) { if (cat_data_len==0) { // texte verif_backblue(opt.path_html); fprintf(fp,"%s%s",""LF,cat_data); } else { // data fwrite(cat_data,cat_data_len,1,fp); } fclose(fp); usercommand(0,NULL,fconcat(opt.path_html,cat_name)); } } } else { // Θcrire normalement le nom de fichier HT_ADD("http://"); HT_ADD(adr); if (*fil!='/') HT_ADD("/"); HT_ADD(fil); }// patcher? } // external } else { // que le nom de fichier (classe java) // en gros recopie de plus bas: copier codebase et base if (p_flush) { char tempo[HTS_URLMAXSIZE*2]; // <-- ajoutΘ char tempo_pat[HTS_URLMAXSIZE*2]; tempo_pat[0]='\0'; strcpy(tempo,fil); // <-- ajoutΘ { char* a=tempo+strlen(tempo)-1; while( ((int) a > (int) tempo) && (*a) && (*a!='/')) a--; if (*a=='/') { char tempo2[HTS_URLMAXSIZE*2]; strcpy(tempo2,a+1); strncat(tempo_pat,tempo,(int) a-(int) tempo+1); // chemin strcpy(tempo,tempo2); // fichier } } // Θrire codebase="chemin" if ((opt.getmode & 1) && (ptr>0)) { char tempo4[HTS_URLMAXSIZE*2]; tempo4[0]='\0'; if (strnotempty(tempo_pat)) { HT_ADD("codebase=\"http://"); HT_ADD(adr); if (*tempo_pat!='/') HT_ADD("/"); HT_ADD(tempo_pat); HT_ADD("\" "); } strncat(tempo4,lastsaved,(int) p_flush-(int) lastsaved); HT_ADD(tempo4); // refresh code=" HT_ADD(tempo); } } } } lastsaved=eadr-1; } /* else if (opt.urlmode==1) { // ABSOLU, c'est le cas le moins courant // NE FONCTIONNE PAS!! (et est inutile) if ((opt.getmode & 1) && (ptr>0)) { // ecrire les html // Θcrire le lien modifiΘ, absolu HT_ADD("file:"); if (*save=='/') HT_ADD(save+1) else HT_ADD(save) } lastsaved=eadr-1; // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein) } */ else if (opt.urlmode==3) { // URI absolue / if ((opt.getmode & 1) && (ptr>0)) { // ecrire les html HT_ADD(fil); } lastsaved=eadr-1; // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein) } else if (opt.urlmode==2) { // RELATIF char tempo[HTS_URLMAXSIZE*2]; tempo[0]='\0'; // calculer le lien relatif if (lienrelatif(tempo,save,savename)==0) { if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"relative link at %s build with %s and %s: %s"LF,adr,save,savename,tempo); test_flush; } // lien applet (code) - il faut placer un codebase avant if (p_type==-1) { // que le nom de fichier if (p_flush) { char tempo_pat[HTS_URLMAXSIZE*2]; tempo_pat[0]='\0'; { char* a=tempo+strlen(tempo)-1; while( ((int) a > (int) tempo) && (*a) && (*a!='/')) a--; if (*a=='/') { char tempo2[HTS_URLMAXSIZE*2]; strcpy(tempo2,a+1); strncat(tempo_pat,tempo,(int) a-(int) tempo+1); // chemin strcpy(tempo,tempo2); // fichier } } // Θrire codebase="chemin" if ((opt.getmode & 1) && (ptr>0)) { char tempo4[HTS_URLMAXSIZE*2]; tempo4[0]='\0'; if (strnotempty(tempo_pat)) { HT_ADD("codebase=\""); HT_ADD(tempo_pat); HT_ADD("\" "); } strncat(tempo4,lastsaved,(int) p_flush-(int) lastsaved); HT_ADD(tempo4); // refresh code=" } } //lastsaved=adr; // dernier Θcrit+1 } if ((opt.getmode & 1) && (ptr>0)) { // Θcrire le lien modifiΘ, relatif HT_ADD(tempo); } lastsaved=eadr-1; // dernier Θcrit+1 (enfin euh apres on fait un ++ alors hein) } else { if (opt.errlog) { fprintf(opt.errlog,"Error building relative link %s and %s"LF,save,savename); test_flush; } } } // sinon le lien sera Θcrit normalement #if 0 if (fexist(save)) { // le fichier existe.. adr[0]='\0'; //if ((opt.debug>0) && (opt.log!=NULL)) { if (opt.errlog) { fspc(opt.errlog,"warning"); fprintf(opt.errlog,"Link has already been written on disk, cancelled: %s"LF,save); test_flush; } } #endif if ((adr[0]!='\0') && (p_type!=2) && (p_type!=-2) && ( (forbidden_url!=1) || (just_test_it))) { // si le fichier n'existe pas, ajouter α la liste // n'y a-t-il pas trop de liens? if (lien_tot+1 >= lien_max-4) { // trop de liens! printf("PANIC! : Too many URLs : >%d [%d]\n",lien_tot,__LINE__); if (opt.errlog) { fprintf(opt.errlog,LF"Too many URLs, giving up..(>%d)"LF,lien_max); fprintf(opt.errlog,"To avoid that: use #L option for more links (example: -#L1000000)"LF); test_flush; } if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } } XH_uninit; // dΘsallocation mΘmoire & buffers return 0; } else { // noter le lien sur la listes des liens α charger int pass_fix,dejafait=0; // Calculer la prioritΘ de ce lien if ((opt.getmode & 4)==0) { // traiter html aprΦs pass_fix=0; } else { // vΘrifier que ce n'est pas un !html if (!ishtml(fil)) pass_fix=1; // prioritΘ infΘrieure (traiter aprΦs) else pass_fix=max(0,numero_passe); // prioritΘ normale } // vΘrifier que le lien n'a pas dΘja ΘtΘ notΘ // si c'est le cas, alors il faut s'assurer que la prioritΘ associΘe // au fichier est la plus grande des deux prioritΘs // // On part de la fin et on essaye de se presser (Θconomise temps machine) #if HTS_HASH { int i=hash_read(&hash,save,"",0); // lecture type 0 (sav) if (i>=0) { liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth-1); dejafait=1; } } #else { register int l; register int i; l=strlen(save); // opti for(i=lien_tot-1;(i>=0) && (dejafait==0);i--) { if (liens[i]->sav_len==l) { // mΩme taille de chaεne if (strcmp(liens[i]->sav,save)==0) { // existe dΘja liens[i]->depth=maximum(liens[i]->depth,liens[ptr]->depth-1); dejafait=1; } } } } #endif // le lien n'a jamais ΘtΘ crΘΘ. // cette fois ci, on le crΘe! if (!dejafait) { // // >>>> CREER LE LIEN <<<< // // enregistrer lien α charger //liens[lien_tot]->adr[0]=liens[lien_tot]->fil[0]=liens[lien_tot]->sav[0]='\0'; // mΩme adresse: l'objet pΦre est l'objet pΦre de l'actuel // DEBUT ROBOTS.TXT AJOUT if (!just_test_it) { if (strfield(adr,"ftp://")==0) { // non ftp if (opt.robots) { // rΘcupΘrer robots if (ishtml(fil)!=0) { // pas la peine pour des fichiers isolΘs if (checkrobots(&robots,adr,"") != -1) { // robots.txt ? checkrobots_set(&robots,adr,""); // ajouter entrΘe vide if (checkrobots(&robots,adr,"") == -1) { // robots.txt ? // enregistrer robots.txt (MACRO) liens_record(adr,"/robots.txt","","",""); if (liens[lien_tot]==NULL) { // erreur, pas de place rΘservΘe printf("PANIC! : Not enough memory [%d]\n",__LINE__); if (opt.errlog) { fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(add_tab_alloc+1)*sizeof(lien_url)); test_flush; } if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } } XH_uninit; // dΘsallocation mΘmoire & buffers return 0; } liens[lien_tot]->testmode=0; // pas mode test liens[lien_tot]->link_import=0; // pas mode import liens[lien_tot]->premier=lien_tot; liens[lien_tot]->precedent=ptr; liens[lien_tot]->depth=0; liens[lien_tot]->pass2=max(0,numero_passe); liens[lien_tot]->retry=0; lien_tot++; // UN LIEN DE PLUS #if DEBUG_ROBOTS printf("robots.txt: added file robots.txt for %s\n",adr); #endif if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"robots.txt added at %s"LF,adr); test_flush; } } else { if (opt.errlog) { fprintf(opt.errlog,"Unexpected robots.txt error at %d"LF,__LINE__); test_flush; } } } } } } } // FIN ROBOTS.TXT AJOUT // enregistrer (MACRO) liens_record(adr,fil,save,former_adr,former_fil); if (liens[lien_tot]==NULL) { // erreur, pas de place rΘservΘe printf("PANIC! : Not enough memory [%d]\n",__LINE__); if (opt.errlog) { fprintf(opt.errlog,"Not enough memory, can not re-allocate %d bytes"LF,(add_tab_alloc+1)*sizeof(lien_url)); test_flush; } if ((opt.getmode & 1) && (ptr>0)) { if (fp) { fclose(fp); fp=NULL; } } XH_uninit; // dΘsallocation mΘmoire & buffers return 0; } // mode test? if (!just_test_it) liens[lien_tot]->testmode=0; // pas mode test else liens[lien_tot]->testmode=1; // mode test if (!import_done) liens[lien_tot]->link_import=0; // pas mode import else liens[lien_tot]->link_import=1; // mode import // Θcrire autres paramΦtres de la structure-lien if ((meme_adresse) && (!import_done) && (liens[ptr]->premier != 0)) liens[lien_tot]->premier=liens[ptr]->premier; else // sinon l'objet pΦre est le prΘcΘdent lui mΩme liens[lien_tot]->premier=lien_tot; // liens[lien_tot]->premier=ptr; liens[lien_tot]->precedent=ptr; // noter la prioritΘ if (!set_prio_to) liens[lien_tot]->depth=liens[ptr]->depth-1; else liens[lien_tot]->depth=max(0,min(liens[ptr]->depth-1,set_prio_to-1)); // PRIORITE NULLE (catch page) // noter pass liens[lien_tot]->pass2=pass_fix; liens[lien_tot]->retry=opt.retry; //strcpy(liens[lien_tot]->adr,adr); //strcpy(liens[lien_tot]->fil,fil); //strcpy(liens[lien_tot]->sav,save); if ((opt.debug>1) && (opt.log!=NULL)) { if (!just_test_it) { fspc(opt.log,"debug"); fprintf(opt.log,"OK, NOTE: %s%s -> %s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil,liens[lien_tot]->sav); } else { fspc(opt.log,"debug"); fprintf(opt.log,"OK, TEST: %s%s"LF,liens[lien_tot]->adr,liens[lien_tot]->fil); } test_flush; } lien_tot++; // UN LIEN DE PLUS } else { // if !dejafait if ((opt.debug>1) && (opt.log!=NULL)) { fspc(opt.log,"debug"); fprintf(opt.log,"link has already been recorded, cancelled: %s"LF,save); test_flush; } } } // si pas trop de liens } // si adr[0]!='\0' } // if adr[0]!='\0' } // if adr[0]!='\0' } // if strlen(lien)>0 } // if ok==0 adr=eadr-1; // ** sauter } // if (p) } // si '<' ou '>' // plus loin adr++; // ---------- // Θcrire peu α peu if ((opt.getmode & 1) && (ptr>0)) HT_ADD_ADR; lastsaved=adr; // dernier Θcrit+1 // ---------- // pour les stats du shell si parsing trop long #if HTS_ANALYSTE==2 _hts_in_html_done=(100 * ((int) adr - (int) r.adr) ) / (int)(r.size); if (_hts_in_html_poll) { LLint nb; int nbk; _hts_in_html_poll=0; // temps α attendre, et remplir autant que l'on peut le cache (backing) back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart); back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot); engine_stats(); nb=back_transfered(HTS_STAT.stat_bytes,back,back_max); nbk=backlinks_done(liens,lien_tot,ptr); //if (!hts_htmlcheck_loop(back,back_max,-1,ptr,lien_tot,nb,new_stat_bytes,(int) (time_local()-stat_timestart),back_nsoc(back,back_max) )) { if (!hts_htmlcheck_loop(back,back_max,0,ptr,lien_tot,nb,new_stat_bytes,(int) (time_local()-HTS_STAT.stat_timestart),back_nsoc(back,back_max), HTS_STAT.stat_files,HTS_STAT.stat_updated_files,fspc(NULL,"error"),(int)HTS_STAT.rate,nbk )) { if (opt.errlog) { fspc(opt.errlog,"info"); fprintf(opt.errlog,"Exit requested by shell or user"LF); test_flush; } exit_xh=1; // exit requested XH_uninit; return 0; //adr = r.adr + r.size; // exit } else if (_hts_cancel==1) { adr = r.adr + r.size; // exit _hts_cancel=0; } } // refresh the backing system each 2 seconds if (engine_stats()) { back_wait(back,back_max,&opt,&cache,HTS_STAT.stat_timestart); back_fillmax(back,back_max,&opt,&cache,liens,ptr,numero_passe,lien_tot); } #endif } while(( ((int) adr) - ((int) r.adr) ) < r.size); #if HTS_ANALYSTE==2 _hts_in_html_parsing=0; // flag _hts_cancel=0; // pas de cancel #endif if ((opt.getmode & 1) && (ptr>0)) { HT_ADD_END; // achever } // // // } // if !error if (opt.getmode & 1) { if (fp) { fclose(fp); fp=NULL; } } // sauver fichier //structcheck(savename); //filesave(r.adr,r.size,savename); #if HTS_ANALYSTE } // analyse OK #endif